In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
from IPython.display import display_html, HTML
import urllib
import glob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import re

# Matches a single ASCII letter; parse_book uses it to skip paragraphs that contain no letters.
letters = re.compile('[a-zA-Z]')

def df_to_html(df):
    """Display the data frame `df` in the notebook via its HTML rendering."""
    table_markup = df.to_html()
    display_html(HTML(table_markup))

def load_gutenberg_book(url, char_limit=10000, min_len_of_sections=40):
    """
    Returns a list of paragraphs in the book.

    url: A url from Project Gutenberg.
    char_limit: Amount of the book to read. The limit is applied to the raw
        download and again to the decoded text. A falsy value (None or 0)
        reads the whole book.
    min_len_of_sections: Each paragraph must be at least this many characters long.

    Paragraphs are the chunks separated by a blank line ("\\r\\n\\r\\n"); line
    breaks inside a paragraph are replaced by spaces. The first six
    qualifying paragraphs are always dropped (presumably front matter), so
    fewer than seven qualifying paragraphs yields an empty list.
    """
    # Imported locally: a bare `import urllib` does not expose
    # `urllib.request` on Python 3, and `urllib.urlopen` no longer exists.
    from urllib.request import urlopen

    # The context manager closes the connection even if reading fails.
    with urlopen(url) as book:
        raw_bytes = book.read(char_limit if char_limit else -1)

    # The response body is bytes. A byte-limited read can stop in the middle
    # of a multi-byte character, so undecodable bytes are replaced instead of
    # raising.
    book_text = raw_bytes.decode("utf-8", errors="replace")
    if char_limit:
        book_text = book_text[:char_limit]

    result = []
    for text in book_text.split("\r\n\r\n"):
        if len(text) >= min_len_of_sections:
            result.append(text.replace("\r\n", " ").strip())

    return result[6:]

def get_text(path):
    """
    Handle all the weird ways books are encoded.

    Opens `path` with each candidate encoding in turn and returns the text
    from the first one that decodes the whole file without error.

    Raises ValueError if none of the candidate encodings can decode the file.
    """
    encoding_options = "ascii utf-8 utf-16 utf-32 utf-16-be utf-16-le utf-32-be utf-32-le".split()

    for encoding in encoding_options:
        try:
            with open(path, encoding=encoding) as book:
                return book.read()
        except UnicodeError:
            # UnicodeError (not just UnicodeDecodeError): the "utf-16" and
            # "utf-32" codecs raise a plain UnicodeError when the stream has
            # no BOM, and that must not stop us trying the explicit
            # byte-order variants further down the list.
            continue
    raise ValueError(
        "Could not decode {!r} with any of: {}".format(path, ", ".join(encoding_options))
    )

def extract_term(term_indicator, text, default=None, max_term_size=75):
    """
    Return the rest of the line that follows `term_indicator` in `text`.

    term_indicator: Marker to look for, e.g. "Title:". Only the first
        occurrence is used.
    text: The text to search.
    default: Returned when the indicator is missing or the value found is
        longer than `max_term_size`.
    max_term_size: Longest value (in characters, after stripping) that is
        accepted.
    """
    term_start = text.find(term_indicator)
    # If not found, return default.
    if term_start == -1:
        return default

    # Look for the end of the line *after* the indicator. Starting the search
    # at the indicator itself would stop immediately on indicators that begin
    # with a newline (such as "\n\nby ") and always produce an empty value.
    value_start = term_start + len(term_indicator)
    term_end = text.find("\n", value_start)
    if term_end == -1:
        # No trailing newline: the value runs to the end of the text.
        term_end = len(text)

    term = text[value_start:term_end].strip()
    if len(term) > max_term_size:
        return default
    return term

def get_author_and_title(book_text, title_case=True):
    """
    Return a `(title, author)` tuple extracted from the text of a book.

    The title comes from the "Title:" indicator and the author from the
    "Author:" indicator. When no author is found that way, a few fallback
    indicators are tried in order on the first 15000 characters, each with
    a 25 character limit on the value.

    Both values are title-cased only when `title_case` is set and both are
    non-empty; otherwise they are returned as extracted (possibly None).
    """
    title = extract_term("Title:", book_text, default=None)
    author = extract_term("Author:", book_text, default=None)

    # Solve for other strange author name formatting
    front_matter = book_text[:15000]
    for fallback_indicator in ("\n\nby ", "\n\nOF ", "\nOF\n"):
        if author is not None:
            break
        author = extract_term(fallback_indicator, front_matter, max_term_size=25)

    if not (title_case and title and author):
        return title, author
    return title.title(), author.title()

def locate_beginning_of_text(title, author, text):
    """
    Return the index in `text` at which the body of the book starts.

    If the "START OF THIS PROJECT GUTENBERG" marker is present, the result
    is the marker position plus 20. Otherwise the position of `title` is
    used, overridden by the position of `author` when that is found too.
    If nothing is found, 0 (the start of the text) is returned.

    title, author: Strings to fall back on; either may be None or empty.
    text: The full text of the book.
    """
    marker_position = text.find("START OF THIS PROJECT GUTENBERG")
    # Test the raw find() result: adding the offset first would turn the
    # "not found" value -1 into 19 and make the fallback below unreachable.
    if marker_position >= 0:
        return marker_position + 20

    location = 0
    # Author is checked last so that, when both are present, it wins.
    for fallback_term in (title, author):
        if not fallback_term:
            continue
        position = text.find(fallback_term)
        if position >= 0:
            location = position

    return location

def locate_end_of_text(text):
    """
    Return the index in `text` of the end-of-book marker.

    Several known marker spellings are searched; the largest position among
    them is returned. When none is present, "Fail" is printed and None is
    returned, which makes a slice such as `text[start:None]` run to the end.
    """
    end_markers = ("End of Project Gutenberg",
                   "END OF THIS PROJECT GUTENBERG EBOOK",
                   "END OF THE PROJECT GUTENBERG EBOOK",
                   "End of the Project Gutenberg Etext")

    latest_position = max(text.find(marker) for marker in end_markers)
    if latest_position >= 0:
        return latest_position

    print("Fail")
    return None

def parse_book(book_text, min_paragraph_characters=100):
    """
    Given the text of a book, returns a list of dictionaries with the keys:
    {title, author, contents, part}

    The text between the detected start and end of the book is split on
    blank lines. `part` is the position of the paragraph in that split, so
    numbering has gaps wherever a paragraph was skipped. A paragraph is
    skipped when it is shorter than `min_paragraph_characters`, contains no
    ASCII letter, or mentions "gutenberg" or "chapter" (case-insensitive).
    """
    title, author = get_author_and_title(book_text)
    body_start = locate_beginning_of_text(title, author, book_text)
    body_end = locate_end_of_text(book_text)
    body = book_text[body_start:body_end]

    records = []
    for part, chunk in enumerate(body.split("\n\n")):
        contents = chunk.replace("\n", " ").strip()

        too_short = len(contents) < min_paragraph_characters
        if too_short or not letters.search(contents):
            continue

        lowered = contents.lower()
        if "gutenberg" in lowered or "chapter" in lowered:
            continue

        records.append({"title": title,
                        "author": author,
                        "contents": contents,
                        "part": part})
    return records

def get_list_of_book_paths(book_directory):
    """Return a list of the paths matching `<book_directory>/*.txt`."""
    pattern = book_directory + '/*.txt'
    return glob.glob(pattern)

def books_to_pandas(book_directory, min_paragraph_characters=100):
    """
    Read every book found in `book_directory` and return one data frame
    with a row per parsed paragraph.

    min_paragraph_characters: Passed through to parse_book.
    """
    all_paragraphs = [
        paragraph
        for book_path in get_list_of_book_paths(book_directory)
        for paragraph in parse_book(get_text(book_path), min_paragraph_characters)
    ]
    return pd.DataFrame(all_paragraphs)

class LemmaTokenizer(object):
    """Callable that tokenizes a document and lemmatizes each token."""

    def __init__(self):
        # One lemmatizer instance is kept and reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of `doc`."""
        lemmatize = self.wnl.lemmatize
        return [lemmatize(token) for token in word_tokenize(doc)]

def cosine_similarity(new_docs, old_docs):
    """
    Returns `new_docs * old_docs.T`.

    The result is meant to be a similarity matrix whose first row holds the
    similarities of the first new doc to each of the old docs.

    NOTE(review): this presumably expects row-normalised scipy sparse
    matrices, where `*` is the matrix product; on plain numpy arrays `*`
    multiplies element-wise instead. Confirm against callers.
    """
    old_docs_transposed = old_docs.T
    return new_docs * old_docs_transposed

def find_closest_matches(similarity_matrix, n_matches_to_return=1):
    """
    For every row, return the column indices of the largest values, best
    match first.

    Expects a dense array of the form [[1., .5, .2],
                                       [.3, 1., .1],
                                       [.2, .4, 1.]]
    where rows correspond to similarities. The result has one row per input
    row and `n_matches_to_return` columns.
    """
    def best_columns(row):
        # argsort is ascending: keep the tail, then flip it so the highest
        # similarity comes first.
        ascending = row.argsort()
        return ascending[-n_matches_to_return:][::-1]

    return np.apply_along_axis(best_columns, 1, similarity_matrix)

# Module-level memo filled by search_book: maps a book_title argument to a dict
# holding the fitted "vectorizer" and the transformed paragraphs ("vect_book").
# Entries are never evicted.
simple_cache = {}

def search_book(paragraph, book_title, books, n_results=10, print_results=False, return_title=False):
    """
    Find the paragraphs of one or more books that are closest to `paragraph`.

    paragraph: The query text.
    book_title: A single title, or a list/tuple of titles, to search in.
    books: Data frame with at least the columns `title` and `contents`.
    n_results: Number of nearest paragraphs to return.
    print_results: When true, also print each distance and paragraph.
    return_title: When true, each result also carries the matched title.

    Returns a list of `(distance, text)` tuples, or
    `(distance, text, title)` tuples when `return_title` is set, nearest
    first. Distances are the ones reported by NearestNeighbors with its
    default metric, computed on TF-IDF vectors.

    Raises ValueError when `books` has no paragraph for the requested title(s).

    The fitted vectorizer and the transformed paragraphs are memoised in
    `simple_cache` per `book_title`. The key does not include `books`, so
    calling again with a different data frame and the same title reuses the
    old vectors.
    """
    if isinstance(book_title, (list, tuple)):
        # Lists are not hashable; use a tuple both for filtering and as the
        # cache key (a tuple argument keeps the key it always had).
        book_title_list = cache_key = tuple(book_title)
    else:
        book_title_list, cache_key = (book_title,), book_title

    select_books = books[books.title.isin(book_title_list)].reset_index()
    if select_books.empty:
        raise ValueError("No paragraphs found for title(s): {!r}".format(book_title))
    contents = select_books.contents

    if cache_key not in simple_cache:
        vectorizer = TfidfVectorizer(max_df=.7, min_df=.0001, tokenizer=LemmaTokenizer()).fit(contents)
        simple_cache[cache_key] = {"vectorizer": vectorizer,
                                   "vect_book": vectorizer.transform(contents)}

    vectorizer = simple_cache[cache_key]["vectorizer"]
    vect_book = simple_cache[cache_key]["vect_book"]
    vect_paragraph = vectorizer.transform([paragraph])

    nbrs = NearestNeighbors(n_neighbors=n_results, algorithm='brute').fit(vect_book)

    distances, indices = nbrs.kneighbors(vect_paragraph)

    # reset_index() above gives a 0..n-1 index, so positional .iloc selects
    # the same rows the removed .ix indexer used to.
    matched_rows = select_books.iloc[indices[0]]
    search_results = list(zip(distances[0], matched_rows.contents, matched_rows.title))

    if print_results:
        for dist, text, title in search_results:
            print(dist)
            print(text)
            print("\n")

    if return_title:
        return search_results
    return [(dist, text) for dist, text, title in search_results]

def compare_book_paragraphs(book_title, books, n_close_matches=10, same_book_in_corpus=True):
    """
    Match every paragraph of the selected book(s) against those same
    book(s) and display the perfect and the closest matches as HTML tables.

    book_title: A single title, or a list/tuple of titles.
    books: Data frame with at least the columns `title` and `contents`.
    n_close_matches: Number of non-perfect matches to display.
    same_book_in_corpus: When true, the second-nearest result is used for
        each paragraph (the nearest is presumably the paragraph itself);
        otherwise the nearest result is used.

    Nothing is returned; the tables are only displayed.
    """
    if isinstance(book_title, (list, tuple)):
        wanted_titles = book_title
    else:
        wanted_titles = (book_title,)
    own_paragraphs = books[books.title.isin(wanted_titles)].reset_index().contents

    # Ask for two neighbours and take the second, or one and take the first.
    if same_book_in_corpus:
        n_neighbours, pick = 2, 1
    else:
        n_neighbours, pick = 1, 0

    rows = [
        [paragraph] + list(search_book(paragraph, book_title, books, n_results=n_neighbours)[pick])
        for paragraph in own_paragraphs
    ]

    df = pd.DataFrame(rows, columns=["Text 1", "Distance", "Text 2"])
    df = df.sort_values("Distance")

    print("Perfect matches")
    df_to_html(df[df.Distance == 0].drop_duplicates())
    print("\n")

    print("Close matches")
    non_perfect = df[df.Distance != 0]
    df_to_html(non_perfect.drop_duplicates("Distance").head(n_close_matches))
    
def compare_book_to_books(book_title, other_book_titles, books, n_close_matches=20):
    """
    Match every paragraph of one book against the paragraphs of other books
    and display the perfect and the closest matches as HTML tables.

    book_title: Title of the book whose paragraphs are looked up.
    other_book_titles: A title or an iterable of titles to search in.
        `book_title` itself is removed from it if present.
    books: Data frame with at least the columns `title` and `contents`.
    n_close_matches: Number of non-perfect matches to display.

    Returns the data frame of all non-perfect matches (columns "Text 1",
    "Distance", "Text 2", "Title"), sorted by distance.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(other_book_titles, str):
        other_book_titles = (other_book_titles,)
    # Always hand search_book a tuple: it uses this argument as a dictionary
    # key, and a list is not hashable.
    other_book_titles = tuple(title for title in other_book_titles if title != book_title)

    book = books[books.title == book_title].reset_index()

    results = []
    for paragraph in book.contents:
        result = search_book(paragraph, other_book_titles, books, n_results=1, return_title=True)[0]
        results.append([paragraph] + list(result))

    df = pd.DataFrame(results, columns=["Text 1", "Distance", "Text 2", "Title"])
    df.sort_values("Distance", inplace=True)

    print("Perfect matches")
    perfect_matches = df[df.Distance == 0].drop_duplicates()
    df_to_html(perfect_matches)
    print("\n")

    print("Close matches")
    close_matches = df[df.Distance != 0]
    df_to_html(close_matches.drop_duplicates("Distance").head(n_close_matches))

    return close_matches

In [2]:
books = books_to_pandas("popular_books", min_paragraph_characters=1)

In [3]:
books.head()


Out[3]:
author contents part title
0 Alexandre Dumas, Pere THE COUNT OF MONTE CRISTO 2 The Count Of Monte Cristo
1 Alexandre Dumas, Pere by Alexandre Dumas, Pere 3 The Count Of Monte Cristo
2 Alexandre Dumas, Pere On the 24th of February, 1815, the look-out at... 6 The Count Of Monte Cristo
3 Alexandre Dumas, Pere As usual, a pilot put off immediately, and rou... 7 The Count Of Monte Cristo
4 Alexandre Dumas, Pere Immediately, and according to custom, the ramp... 8 The Count Of Monte Cristo

In [4]:
books.title.value_counts()


Out[4]:
War And Peace                                11373
The Count Of Monte Cristo                    11211
The Three Musketeers                          8206
The Complete Works Of William Shakespeare     6351
The Man In The Iron Mask                      5404
The Works Of Edgar Allan Poe                  4836
Notre-Dame De Paris                           4025
Great Expectations                            3834
Ben-Hur                                       3572
A Tale Of Two Cities                          3315
The Adventures Of Sherlock Holmes             2540
Moby Dick; Or The Whale                       2485
A Journey To The Centre Of The Earth          2446
Adventures Of Huckleberry Finn, Complete      2386
The Phantom Of The Opera                      2340
Emma                                          2320
The Iliad Of Homer                            2199
Leviathan                                     2104
Dracula                                       2070
Pride And Prejudice                           2060
The Adventures Of Tom Sawyer, Complete        2028
Wuthering Heights                             1906
Sense And Sensibility                         1813
Around The World In 80 Days                   1681
Peter Pan                                     1659
Robin Hood                                    1625
The Picture Of Dorian Gray                    1501
Beowulf                                       1454
Treasure Island                               1437
Dorothy And The Wizard In Oz                  1329
The Invisible Man                             1144
The Wonderful Wizard Of Oz                    1139
The Jungle Book                                979
Grimms' Fairy Tales                            978
The Importance Of Being Earnest                960
Through The Looking-Glass                      938
The War Of The Worlds                          915
A Study In Scarlet                             825
Alice'S Adventures In Wonderland               798
The Scarlet Letter                             791
The Island Of Doctor Moreau                    759
Frankenstein                                   700
Youth                                          517
Dr. Jekyll And Mr. Hyde                        430
The Time Machine                               322
The Yellow Wallpaper                           269
Heart Of Darkness                              204
Metamorphosis                                  103
A Modest Proposal                               37
Name: title, dtype: int64

Search for one paragraph in one book

Number of comparisons ~ 5000


In [5]:
%%time
paragraph = "alice doesn't know which way to go"
book_title = "Alice'S Adventures In Wonderland"

search_book(paragraph, book_title, books, n_results=5, print_results=True)


0.872256370575
'Then it doesn't matter which way you go,' said the Cat.


1.14624113894
She ate a little bit, and said anxiously to herself, 'Which way? Which way?', holding her hand on the top of her head to feel which way it was growing, and she was quite surprised to find that she remained the same size: to be sure, this generally happens when one eats cake, but Alice had got so much into the way of expecting nothing but out-of-the-way things to happen, that it seemed quite dull and stupid for life to go on in the common way.


1.15233505399
'It goes on, you know,' the Hatter continued, 'in this way:--


1.22691887913
'Oh, I'm not particular as to size,' Alice hastily replied; 'only one doesn't like changing so often, you know.'


1.2269301777
'I DON'T know,' said the Caterpillar.


Wall time: 4.72 s

In [6]:
%%time
paragraph = "queen says off with his or her head"
book_title = "Alice'S Adventures In Wonderland"

search_book(paragraph, book_title, books, n_results=5, print_results=True)


1.12373165041
'Are their heads off?' shouted the Queen.


1.16790123046
All the time they were playing the Queen never left off quarrelling with the other players, and shouting 'Off with his head!' or 'Off with her head!' Those whom she sentenced were taken into custody by the soldiers, who of course had to leave off being arches to do this, so that by the end of half an hour or so there were no arches left, and all the players, except the King, the Queen, and Alice, were in custody and under sentence of execution.


1.16958287991
The Queen turned crimson with fury, and, after glaring at her for a moment like a wild beast, screamed 'Off with her head! Off--'


1.17078483868
The Queen had only one way of settling all difficulties, great or small. 'Off with his head!' she said, without even looking round.


1.17398804409
'Off with her head!' the Queen shouted at the top of her voice. Nobody moved.


Wall time: 16 ms

Compare all paragraphs in one book

Number of comparisons ~ 25 million


In [7]:
# See the entire string when printing a data frame
# NOTE(review): newer pandas releases expect None rather than -1 for "no limit" — confirm against the installed version.
pd.set_option('display.max_colwidth', -1)

In [8]:
%%time
compare_book_paragraphs(book_title, books)


Perfect matches
Text 1 Distance Text 2
260 'Wow! wow! wow!' 0 'Wow! wow! wow!'
254 CHORUS. 0 CHORUS.

Close matches
Text 1 Distance Text 2
592 Will you, won't you, will you, won't you, will you join the dance? Will you, won't you, will you, won't you, won't you join the dance? 0.085699 Will you, won't you, will you, won't you, will you join the dance? Will you, won't you, will you, won't you, won't you join the dance?"'
647 'Beautiful Soup! Who cares for fish, Game, or any other dish? Who would not give all else for two Pennyworth only of beautiful Soup? Pennyworth only of beautiful Soup? Beau--ootiful Soo--oop! Beau--ootiful Soo--oop! Soo--oop of the e--e--evening, Beautiful, beauti--FUL SOUP!' 0.581810 'Beautiful Soup, so rich and green, Waiting in a hot tureen! Who for such dainties would not stoop? Soup of the evening, beautiful Soup! Soup of the evening, beautiful Soup! Beau--ootiful Soo--oop! Beau--ootiful Soo--oop! Soo--oop of the e--e--evening, Beautiful, beautiful Soup!
183 'I DON'T know,' said the Caterpillar. 0.651287 'I don't see,' said the Caterpillar.
154 'You!' said the Caterpillar contemptuously. 'Who are YOU?' 0.707552 'Who are YOU?' said the Caterpillar.
734 'Nothing whatever,' said Alice. 0.748197 'Nothing,' said Alice.
733 'Nothing WHATEVER?' persisted the King. 0.754383 'Nothing whatever,' said Alice.
651 'Soo--oop of the e--e--evening, Beautiful, beautiful Soup!' 0.775282 'Beautiful Soup, so rich and green, Waiting in a hot tureen! Who for such dainties would not stoop? Soup of the evening, beautiful Soup! Soup of the evening, beautiful Soup! Beau--ootiful Soo--oop! Beau--ootiful Soo--oop! Soo--oop of the e--e--evening, Beautiful, beautiful Soup!
315 'Not the same thing a bit!' said the Hatter. 'You might just as well say that "I see what I eat" is the same thing as "I eat what I see"!' 0.815753 'You might just as well say,' added the March Hare, 'that "I like what I get" is the same thing as "I get what I like"!'
554 'What was THAT like?' said Alice. 0.822642 'What for?' said Alice.
678 'Give your evidence,' said the King; 'and don't be nervous, or I'll have you executed on the spot.' 0.846233 'Give your evidence,' the King repeated angrily, 'or I'll have you executed, whether you're nervous or not.'
Wall time: 12.5 s

Compare all paragraphs in one book to all books

Number of comparisons ~ 500 million


In [9]:
all_book_titles = books.title.unique().tolist()

In [10]:
%%time
book_title = "Alice'S Adventures In Wonderland"
close_matches = compare_book_to_books(book_title, all_book_titles, books)


Perfect matches
Text 1 Distance Text 2 Title
797 THE END 0 THE END Dracula
259 CHORUS. 0 Chorus. The Complete Works Of William Shakespeare
260 'Wow! wow! wow!' 0 'Tu whu! Tu whu! Tu whu!' Grimms' Fairy Tales
1 Lewis Carroll 0 LEWIS WALLACE Ben-Hur
2 THE MILLENNIUM FULCRUM EDITION 3.0 0 The Millennium Fulcrum Edition 1.7 Through The Looking-Glass

Close matches
Text 1 Distance Text 2 Title
338 'I don't know what you mean,' said Alice. 0.511965 'I don't know what you mean by "glory,"' Alice said. Through The Looking-Glass
586 'Very much indeed,' said Alice. 0.542440 'Very much indeed,' Alice said politely. Through The Looking-Glass
542 'Certainly not!' said Alice indignantly. 0.654421 'Certainly,' said Alice. Through The Looking-Glass
612 'And what are they made of?' Alice asked in a tone of great curiosity. 0.658161 'But what are they for?' Alice asked in a tone of great curiosity. Through The Looking-Glass
391 'Of course they were', said the Dormouse; '--well in.' 0.693690 'Of course,' was my answer; 'of course we are.' Wuthering Heights
786 'I won't!' said Alice. 0.707242 'I see you don't,' said Alice. Through The Looking-Glass
271 'I don't much care where--' said Alice. 0.721446 'I see you don't,' said Alice. Through The Looking-Glass
592 Will you, won't you, will you, won't you, will you join the dance? Will you, won't you, will you, won't you, won't you join the dance? 0.728049 "You won't, won't you?  Well, I sh'd _reckon_ you won't!" Adventures Of Huckleberry Finn, Complete
596 Will you, won't you, will you, won't you, will you join the dance? Will you, won't you, will you, won't you, won't you join the dance?"' 0.728398 "You won't, won't you?  Well, I sh'd _reckon_ you won't!" Adventures Of Huckleberry Finn, Complete
435 'Yes!' shouted Alice. 0.733234 'Yes, if you like,' said Alice. Through The Looking-Glass
749 'What's in it?' said the Queen. 0.742012 'What did he want?' said the Red Queen. Through The Looking-Glass
604 'Thank you,' said Alice, 'it's very interesting. I never knew so much about a whiting before.' 0.742622 'Thank you very much,' said Alice. Through The Looking-Glass
752 'Who is it directed to?' said one of the jurymen. 0.760924 'Who is there?' Grimms' Fairy Tales
283 'I suppose so,' said Alice. 0.785349 'I see you don't,' said Alice. Through The Looking-Glass
674 'It isn't mine,' said the Hatter. 0.786595 'It was.' Wuthering Heights
651 'Soo--oop of the e--e--evening, Beautiful, beautiful Soup!' 0.793615 Teneri sdegni, e placide, e tranquille Repulse, e cari vezzi, e liete paci, Sorrisi, parolette, e dolci stille Di pianto, e sospir tronchi, e molli baci." The Iliad Of Homer
0 ALICE'S ADVENTURES IN WONDERLAND 0.808307 ADVENTURES Adventures Of Huckleberry Finn, Complete
717 'Shan't,' said the cook. 0.809247 "And cooks?" The Count Of Monte Cristo
183 'I DON'T know,' said the Caterpillar. 0.809791 'I don't know,' I said. Wuthering Heights
340 'Perhaps not,' Alice cautiously replied: 'but I know I have to beat time when I learn music.' 0.809837 'Perhaps it would,' Alice replied cautiously. Through The Looking-Glass
Wall time: 8min 45s

In [11]:
close_matches[:100].Title.value_counts()


Out[11]:
Through The Looking-Glass                   57
Wuthering Heights                           15
The Man In The Iron Mask                    4 
The Adventures Of Sherlock Holmes           4 
Adventures Of Huckleberry Finn, Complete    3 
The Count Of Monte Cristo                   2 
A Study In Scarlet                          2 
The Works Of Edgar Allan Poe                2 
Dracula                                     1 
Grimms' Fairy Tales                         1 
The Wonderful Wizard Of Oz                  1 
A Tale Of Two Cities                        1 
Pride And Prejudice                         1 
Robin Hood                                  1 
The Phantom Of The Opera                    1 
The Iliad Of Homer                          1 
Notre-Dame De Paris                         1 
Great Expectations                          1 
Name: Title, dtype: int64

In [12]:
%%time
book_title = "The Adventures Of Sherlock Holmes"
close_matches = compare_book_to_books(book_title, all_book_titles, books)


Perfect matches
Text 1 Distance Text 2 Title
1180 "Certainly." 0 "Certainly." The Count Of Monte Cristo
124 "It was." 0 "It was." The Count Of Monte Cristo
1413 "Yes, sir." 0 "Yes, sir." The Count Of Monte Cristo
389 "Yes." 0 "Yes." The Count Of Monte Cristo
1465 "What, then?" 0 "What then?" The Count Of Monte Cristo
936 "Entirely." 0 "Entirely." The Count Of Monte Cristo
1501 "Always." 0 "Always." The Count Of Monte Cristo
1502 "And why?" 0 "And why?" The Count Of Monte Cristo
916 "Ah!" 0 "Ah!" The Man In The Iron Mask
1524 "Yes, all." 0 "Yes, all." The Three Musketeers
103 "And Irene Adler?" 0 "And _Jim_?" Adventures Of Huckleberry Finn, Complete
1534 "By no means." 0 "By no means." The Man In The Iron Mask
99 "But how?" 0 "But how?" The Three Musketeers
1380 "Well?" 0 "Well?" The Count Of Monte Cristo
911 "Nothing." 0 "Nothing." The Count Of Monte Cristo
1584 "So it appears." 0 "It appears so." The Three Musketeers
222 III. 0 III. The Works Of Edgar Allan Poe
821 "What?" 0 "What?" The Count Of Monte Cristo
225 "Not yet." 0 "Not yet." Great Expectations
369 "'Yes.' 0 "'Yes.' A Study In Scarlet
548 "None." 0 "None." The Count Of Monte Cristo
1668 "No." 0 "No." The Count Of Monte Cristo
1685 I nodded again. 0 I nodded again. The Works Of Edgar Allan Poe
188 "What then?" 0 "What then?" The Count Of Monte Cristo
126 II. 0 II. The Works Of Edgar Allan Poe
965 "It is possible." 0 "It is possible." The Count Of Monte Cristo
130 "What is it?" 0 "What is it?" The Count Of Monte Cristo
1014 "Yes?" 0 "Yes?" The Count Of Monte Cristo
1067 "Where is it, then?" 0 "Where is it, then?" Adventures Of Huckleberry Finn, Complete
1002 "What do you mean?" 0 "What do you mean?" The Count Of Monte Cristo
999 "Well." 0 "Well." The Count Of Monte Cristo
521 "No?" 0 "No?" A Tale Of Two Cities
174 "Precisely." 0 "Precisely." The Count Of Monte Cristo
1117 "I do." 0 "I do." The Count Of Monte Cristo
1154 "Very much so." 0 "Very much so." Around The World In 80 Days
155 "Not in the least." 0 "Not in the least." The Count Of Monte Cristo
1172 "Never." 0 "Never." The Count Of Monte Cristo
1338 "By all means." 0 "By all means." The Count Of Monte Cristo
791 "BALLARAT." 0 "Oliva Corsinari." The Count Of Monte Cristo
1786 "'Entirely.' 0 "'Signed El-Kobbir.' The Count Of Monte Cristo
1016 "What will you do, then?" 0 "What will you do, then?" The Man In The Iron Mask
2143 "'Ample.' 0 "'Signed El-Kobbir.' The Count Of Monte Cristo
2243 "Do you know him?" 0 "Do you know him?" Great Expectations
2281 "Where to?" 0 "Where to?" The Count Of Monte Cristo
582 "Of what?" 0 "Of what?" The Three Musketeers
751 "And that is--" 0 "And that is--" Around The World In 80 Days
1943 "Pray do so." 0 "Pray do so." The Count Of Monte Cristo
21 "How often?" 0 "How often?" Great Expectations
2049 "And how?" 0 "And how?" Dracula
719 "Yes, certainly." 0 "Yes, certainly." The Count Of Monte Cristo
714 "The doctor?" 0 "The doctor?" The Count Of Monte Cristo
651 "What are they?" 0 "What are they?" The Count Of Monte Cristo
1 by 0 by The Phantom Of The Opera
5 I. 0 I. The Works Of Edgar Allan Poe
1873 "Dr. Becher's." 0 Merrick's "Tryphiodorus," 148, 99. The Iliad Of Homer

Close matches
Text 1 Distance Text 2 Title
704 "I think that it is very probable." 0.204008 "I think that is very probable." A Journey To The Centre Of The Earth
415 "And what did you see?" 0.220975 "What did you see?" The Phantom Of The Opera
1642 "But what will you do?" 0.272276 "But you, what will you do?" The Man In The Iron Mask
202 "You have the photograph?" 0.282313 "Have you?" Great Expectations
2492 "Yes, the wine-cellar." 0.284365 "Yes." The Count Of Monte Cristo
163 "And what then?" 0.285819 "What then?" The Count Of Monte Cristo
1924 "And is that all?" 0.289264 "Is that all?" The Man In The Iron Mask
1254 "What, then, did Peterson do?" 0.299098 "What did he do, then?" The Man In The Iron Mask
402 "What are you going to do, then?" I asked. 0.311689 "What are you going to do?" I asked. Dracula
135 "And what of Irene Adler?" I asked. 0.339803 "What?" I asked. None
115 "Absolutely?" 0.340286 "Absolutely." The Count Of Monte Cristo
1146 "That is possible." 0.352484 "Is that possible?" The Count Of Monte Cristo
1132 "How can you tell that?" 0.355596 "How can I tell you?" The Count Of Monte Cristo
807 "And the cigar-holder?" 0.364676 "And _Jim_?" Adventures Of Huckleberry Finn, Complete
106 "I am sure." 0.367847 "I am sure of it." The Count Of Monte Cristo
2346 "It is half-past ten now." 0.378988 It was now half-past ten. Around The World In 80 Days
991 "To the police?" 0.379647 "The police." The Count Of Monte Cristo
235 "But to whom?" 0.381023 "To whom?" The Count Of Monte Cristo
797 "Quite so." 0.385039 "Not quite so." The Man In The Iron Mask
777 "But who is he?" 0.393689 "Who is he?" The Count Of Monte Cristo
Wall time: 14min 21s

In [13]:
close_matches[:200].Title.value_counts()


Out[13]:
The Count Of Monte Cristo                   63
The Man In The Iron Mask                    19
A Study In Scarlet                          17
The Three Musketeers                        13
A Tale Of Two Cities                        12
The Works Of Edgar Allan Poe                11
Around The World In 80 Days                 8 
Great Expectations                          7 
Peter Pan                                   6 
Adventures Of Huckleberry Finn, Complete    6 
Pride And Prejudice                         5 
Dracula                                     4 
Moby Dick; Or The Whale                     3 
Through The Looking-Glass                   3 
The Phantom Of The Opera                    3 
Emma                                        3 
A Journey To The Centre Of The Earth        3 
Notre-Dame De Paris                         2 
The Adventures Of Tom Sawyer, Complete      2 
The Invisible Man                           2 
Wuthering Heights                           1 
The Picture Of Dorian Gray                  1 
The War Of The Worlds                       1 
The Island Of Doctor Moreau                 1 
Name: Title, dtype: int64

Compare all paragraphs in all books to all books

Number of comparisons > 13 billion


In [18]:
%%time
def compare_all_books(books, n_close_matches=20):
    """
    For each paragraph of each titled book, find the nearest paragraph in any other book.

    books: DataFrame with a `title` column and a `contents` column (the paragraph text).
           Rows whose title is missing are never used as the "book 1" side, but they
           remain candidates on the "book 2" side.
    n_close_matches: Currently unused; kept so existing callers keep working.

    Returns a DataFrame with the columns book_1_title, book_1_paragraph, book_2_title,
    book_2_paragraph and paragraph_distance, sorted by ascending paragraph_distance.
    """
    # fit_transform tokenizes the corpus once; fit() followed by transform() on the
    # same data produced the same matrix but ran the tokenizer over every paragraph twice.
    vectorizer = TfidfVectorizer(max_df=.7, min_df=.0001, tokenizer=LemmaTokenizer())
    vect_book = vectorizer.fit_transform(books.contents)

    results = {"book_1_title":[],
               "book_1_paragraph":[],
               "book_2_title":[],
               "book_2_paragraph":[],
               "paragraph_distance":[]}

    # Pull the two columns out once instead of re-slicing the whole frame per title.
    all_contents = books.contents.values
    all_titles = books.title.values

    book_titles = books.title.dropna().unique().tolist()

    for book_title in book_titles:
        book_mask = (books.title == book_title).values
        other_book_mask = ~book_mask

        # Nothing to compare against when every row belongs to this one book;
        # fitting NearestNeighbors on zero samples would raise.
        if not other_book_mask.any():
            continue

        nbrs = NearestNeighbors(n_neighbors=1, algorithm='brute').fit(vect_book[other_book_mask])
        distances, indices = nbrs.kneighbors(vect_book[book_mask])

        # Indices are positions within the "other books" subset, not within `books`.
        nearest = indices.flatten()

        results["book_1_paragraph"].extend(all_contents[book_mask].tolist())
        results["book_2_paragraph"].extend(all_contents[other_book_mask][nearest])

        # One title entry per paragraph of the current book.
        results["book_1_title"].extend([book_title] * int(book_mask.sum()))
        results["book_2_title"].extend(all_titles[other_book_mask][nearest])

        results["paragraph_distance"].extend(distances.flatten())

    results_sorted_by_distance = pd.DataFrame(results).sort_values("paragraph_distance")

    return results_sorted_by_distance

# Run the cross-book comparison on `books`; the cells below query `results`.
results = compare_all_books(books)


Wall time: 28min 44s

In [55]:
# First 100 distinct rows whose nearest-neighbour distance is exactly zero.
results.loc[results["paragraph_distance"] == 0].drop_duplicates().iloc[:100]


Out[55]:
book_1_paragraph book_1_title book_2_paragraph book_2_title paragraph_distance
114317 THE END The Complete Works Of William Shakespeare THE END Dracula 0
8445 "Certainly." The Count Of Monte Cristo "Certainly." The Man In The Iron Mask 0
8430 "Probably." The Count Of Monte Cristo "Probably." The Man In The Iron Mask 0
45108 "Whom?" The Phantom Of The Opera "Whom?" The Three Musketeers 0
8425 "Are you sure of it?" The Count Of Monte Cristo "Are you sure of it?" The Three Musketeers 0
19244 "Exactly." The Three Musketeers "Exactly." The Count Of Monte Cristo 0
45146 "Co-ack!" The Phantom Of The Opera "Shelled!" The Works Of Edgar Allan Poe 0
8411 "How do you know?" The Count Of Monte Cristo "How do you know?" The Phantom Of The Opera 0
69129 "Never." A Journey To The Centre Of The Earth "Never." The Count Of Monte Cristo 0
8406 "Yes." The Count Of Monte Cristo "Yes." The Man In The Iron Mask 0
8450 "Well?" The Count Of Monte Cristo "Well?" The Man In The Iron Mask 0
69101 [Illustration: Runic Glyphs] A Journey To The Centre Of The Earth [Illustration: colophon] Dracula 0
8403 "What do you mean?" The Count Of Monte Cristo "What do you mean?" The Man In The Iron Mask 0
45172 "We don't know." The Phantom Of The Opera "We don't know." The Man In The Iron Mask 0
69034 TABLE OF CONTENTS A Journey To The Centre Of The Earth TABLE OF CONTENTS. Notre-Dame De Paris 0
2720 "Where?" The Count Of Monte Cristo "Where?" The Man In The Iron Mask 0
8389 "No." The Count Of Monte Cristo "No." The Man In The Iron Mask 0
69033 By Jules Verne A Journey To The Centre Of The Earth by Jules Verne None 0
69031 Produced by Norm Wolcott A Journey To The Centre Of The Earth Produced by Dennis Amundson. Dorothy And The Wizard In Oz 0
69024 "What?" Around The World In 80 Days "What?" The Count Of Monte Cristo 0
69090 mm.rnlls esruel seecJde sgtssmf unteief niedrke kt,samn atrateS Saodrrn emtnaeI nuaect rrilSa Atvaar .nscrc ieaabs ccdrmi eeutul frantu dt,iac oseibo KediiY A Journey To The Centre Of The Earth _By_ The Three Musketeers 0
45057 "What did you see?" The Phantom Of The Opera "What did you see?" None 0
45047 "How?" The Phantom Of The Opera "How?" The Count Of Monte Cristo 0
69412 "Efter." A Journey To The Centre Of The Earth "Oliva Corsinari." The Count Of Monte Cristo 0
69386 "What is to be done?" A Journey To The Centre Of The Earth "What is to be done?" Great Expectations 0
69384 "Why?" A Journey To The Centre Of The Earth "Why?" The Count Of Monte Cristo 0
69356 "Why so?" A Journey To The Centre Of The Earth "Why so?" The Count Of Monte Cristo 0
108331 Enter VARRIUS The Complete Works Of William Shakespeare Enter Castigilone. The Works Of Edgar Allan Poe 0
69348 "Exactly so." A Journey To The Centre Of The Earth "Exactly so." The Count Of Monte Cristo 0
8502 "By whom?" The Count Of Monte Cristo "By whom?" A Tale Of Two Cities 0
... ... ... ... ... ...
45371 "Perhaps." The Phantom Of The Opera "Perhaps." The Count Of Monte Cristo 0
68880 "I am." Around The World In 80 Days "I am." The Count Of Monte Cristo 0
45377 "I promise." The Phantom Of The Opera "I promise." The Count Of Monte Cristo 0
45378 "When?" The Phantom Of The Opera "When?" The Count Of Monte Cristo 0
68870 "Yes." Around The World In 80 Days "Yes." The Count Of Monte Cristo 0
45379 "To-morrow." The Phantom Of The Opera "To-morrow." The Count Of Monte Cristo 0
68856 "Agreed." Around The World In 80 Days "Agreed." The Count Of Monte Cristo 0
94799 "Listen!" said he. The Adventures Of Tom Sawyer, Complete "Listen!" he said. The War Of The Worlds 0
8318 "Indeed?" The Count Of Monte Cristo "Indeed?" The Man In The Iron Mask 0
69441 "Et quacunque viam dederit fortuna sequamur." A Journey To The Centre Of The Earth "Justum et tenacem propositi virum." The Count Of Monte Cristo 0
19453 "Bah!" The Three Musketeers "Bah!" The Count Of Monte Cristo 0
44854 "How do you know?" The Phantom Of The Opera "How do you know?" The Count Of Monte Cristo 0
2622 "I swear to you I will." The Count Of Monte Cristo "I will, I swear to you." The Three Musketeers 0
70117 "Certainly." A Journey To The Centre Of The Earth "Certainly." The Count Of Monte Cristo 0
70098 "Do you think so?" A Journey To The Centre Of The Earth "Do you think so?" The Count Of Monte Cristo 0
94195 "Yes." The Adventures Of Tom Sawyer, Complete "Yes." The Count Of Monte Cristo 0
70059 "And what is that?" A Journey To The Centre Of The Earth "And what is that?" The Count Of Monte Cristo 0
18866 "When?" The Three Musketeers "When?" The Count Of Monte Cristo 0
8734 "But what?" The Count Of Monte Cristo "But what?" The Man In The Iron Mask 0
44419 I Metamorphosis I. The Adventures Of Sherlock Holmes 0
108664 SONG. The Complete Works Of William Shakespeare SONG The Works Of Edgar Allan Poe 0
2507 "Precisely." The Count Of Monte Cristo "Precisely." The Man In The Iron Mask 0
44450 II Metamorphosis II. The Adventures Of Sherlock Holmes 0
44480 III Metamorphosis III. The Adventures Of Sherlock Holmes 0
44521 by The Phantom Of The Opera by The Adventures Of Sherlock Holmes 0
8724 "Impossible!" The Count Of Monte Cristo "Impossible!" The Picture Of Dorian Gray 0
44522 Gaston Leroux The Phantom Of The Opera _By_ The Three Musketeers 0
70123 "I am sure of it." A Journey To The Centre Of The Earth "I am sure of it." The Count Of Monte Cristo 0
18818 "I think not." The Three Musketeers "I think not." The Count Of Monte Cristo 0
29580 "Well?" Dracula "Well?" The Count Of Monte Cristo 0

100 rows × 5 columns


In [54]:
# First 100 rows with a non-zero distance, keeping one row per distinct distance value.
results.loc[results["paragraph_distance"] > 0].drop_duplicates(subset="paragraph_distance").iloc[:100]


Out[54]:
book_1_paragraph book_1_title book_2_paragraph book_2_title paragraph_distance
13890 "Raoul! Raoul!" The Man In The Iron Mask "Raoul! Raoul! Raoul!" The Phantom Of The Opera 0.053526
37674 Produced by David Widger and Carlo Traverso The Works Of Edgar Allan Poe Produced by David Widger Adventures Of Huckleberry Finn, Complete 0.109705
107978 If you discover a Defect in this etext within 90 days of receiv- ing it, you can receive a refund of the money (if any) you paid for it by sending an explanatory note within that time to the person you received it from. If you received it on a physical medium, you must return it with your note, and such person may choose to alternatively give you a replacement copy. If you received it electronically, such person may choose to alternatively give you a second opportunity to receive it electronically. The Complete Works Of William Shakespeare If you discover a Defect in this etext within 90 days of receiving it, you can receive a refund of the money (if any) you paid for it by sending an explanatory note within that time to the person you received it from. If you received it on a physical medium, you must return it with your note, and such person may choose to alternatively give you a replacement copy. If you received it electronically, such person may choose to alternatively give you a second opportunity to receive it electronically. None 0.127986
6588 "Oh, heavens!" The Count Of Monte Cristo "Oh, Heaven! Oh, Heaven!" The Phantom Of The Opera 0.145526
44943 "Raoul!" The Phantom Of The Opera "Raoul! Raoul!" The Man In The Iron Mask 0.153920
56625 Soldiers began then to make on the barrow The largest of dead-fires: dark o'er the vapor The smoke-cloud ascended, the sad-roaring fire, 10 Mingled with weeping (the wind-roar subsided) Till the building of bone it had broken to pieces, Hot in the heart. Heavy in spirit They mood-sad lamented the men-leader's ruin; And mournful measures the much-grieving widow 15 *       *       *       *       *       *       * *       *       *       *       *       *       * *       *       *       *       *       *       * *       *       *       *       *       *       * *       *       *       *       *       *       * 20 *       *       *       *       *       *       * Beowulf * Brucoea ferruginea. The Count Of Monte Cristo 0.154537
97495 THE PREFACE The Picture Of Dorian Gray AUTHOR’S PREFACE The Three Musketeers 0.156077
56317 65 Dragon, to govern, who guarded a treasure, A high-rising stone-cliff, on heath that was grayish: A path 'neath it lay, unknown unto mortals. Some one of earthmen entered the mountain, The heathenish hoard laid hold of with ardor; 70 *       *       *       *       *       *       * *       *       *       *       *       *       * *       *       *       *       *       *       * *       *       *       *       *       *       * *       *       *       *       *       *       * Beowulf * Brucoea ferruginea. The Count Of Monte Cristo 0.156910
104938 BOOK FOURTH. Notre-Dame De Paris The Fourth Book Leviathan 0.158824
36765 "Surely, surely." A Tale Of Two Cities "Surely." None 0.159182
33375 "Thank you. Thank you." Great Expectations "Thank you." The Man In The Iron Mask 0.163576
107021 "What next?" Notre-Dame De Paris "What next? what next?" The Three Musketeers 0.165573
110612 THE PROLOGUE. The Complete Works Of William Shakespeare Prologue The Phantom Of The Opera 0.168098
21386 "Silence!" The Three Musketeers "Silence! silence!" Notre-Dame De Paris 0.170804
103595 The Third Book Leviathan BOOK THIRD. Notre-Dame De Paris 0.175778
101920 THE INTRODUCTION Leviathan INTRODUCTION. The Island Of Doctor Moreau 0.181112
39109 "Yes, yes," I said, "yes, yes." The Works Of Edgar Allan Poe "Yes, yes." The Man In The Iron Mask 0.183521
44577 "Mother! Mother!" The Phantom Of The Opera "Mother!" Peter Pan 0.185424
104390 BOOK SECOND. Notre-Dame De Paris The Second Book Leviathan 0.186028
18980 "Speak!" The Three Musketeers "Speak! speak!" The Man In The Iron Mask 0.188020
42690 'Yes.' Wuthering Heights "'Yes.' A Study In Scarlet 0.188326
44611 "Of course, of course." The Phantom Of The Opera "Of course." The Count Of Monte Cristo 0.190496
14879 "What sort of man is he?" The Man In The Iron Mask "What sort of a man is he?" The Count Of Monte Cristo 0.194105
10523 "Sir! sir!" The Count Of Monte Cristo "Sir!" The Works Of Edgar Allan Poe 0.194482
56323 *       *       *       *       *       *       * He sought of himself who sorely did harm him, But, for need very pressing, the servant of one of The sons of the heroes hate-blows evaded, 5 Seeking for shelter and the sin-driven warrior Took refuge within there. He early looked in it, *       *       *       *       *       *       * *       *       *       *       *       *       * [76] *  *  *  *  *  * when the onset surprised him, Beowulf * Brucoea ferruginea. The Count Of Monte Cristo 0.195603
106566 "_De ventre inferi clamavi, et exaudisti vocem meam_. Notre-Dame De Paris "Justum et tenacem propositi virum." The Count Of Monte Cristo 0.196945
15113 "Yes, a thousand times, yes!" The Man In The Iron Mask "Yes; a thousand times, yes!" The Count Of Monte Cristo 0.198731
54894 "Why? Why?" Youth "Why?" The Count Of Monte Cristo 0.201626
66428 "True, true." Peter Pan "True." The Man In The Iron Mask 0.202981
101882 THE SECOND PART Leviathan SECOND PART None 0.203286
... ... ... ... ... ...
74665 "Carrot ices." War And Peace "Ice, ice, and more ice." None 0.265868
107974 (1) The etext, when displayed, is clearly readable, and does *not* contain characters other than those intended by the author of the work, although tilde (~), asterisk (*) and underline (_) characters may be used to convey punctuation intended by the author, and additional characters may be used to indicate hypertext links; OR The Complete Works Of William Shakespeare [*] The etext, when displayed, is clearly readable, and does *not* contain characters other than those intended by the author of the work, although tilde (~), asterisk (*) and underline (_) characters may be used to convey punctuation intended by the author, and additional characters may be used to indicate hypertext links; OR None 0.266594
3455 "The deuce." The Count Of Monte Cristo "The deuce!" The Man In The Iron Mask 0.271376
48253 "Why not? Moby Dick; Or The Whale "Why not?" The Count Of Monte Cristo 0.271485
27288 "But what will you do?" The Adventures Of Sherlock Holmes "But you, what will you do?" The Man In The Iron Mask 0.271746
24264 "How do you know that?" The Three Musketeers "How do you know?" The Count Of Monte Cristo 0.272952
16618 *Alexandre Dumas, Pere* The Three Musketeers by Alexandre Dumas, Pere The Count Of Monte Cristo 0.274533
36704 "How do you know it?" A Tale Of Two Cities "How do you know?" The Count Of Monte Cristo 0.277022
41820 OF ENGLAND The Works Of Edgar Allan Poe Sunderland, England None 0.277707
107188 Mathias Hungadi shook his head. Notre-Dame De Paris He shook his head. A Tale Of Two Cities 0.278924
42377 A DREAM The Works Of Edgar Allan Poe Dreams Leviathan 0.280482
62493 "Yes, ma'am, all." Pride And Prejudice "Yes, ma'am." Great Expectations 0.284228
45747 "I don't understand!" The Phantom Of The Opera "But I don't understand!" None 0.285117
28138 "Yes, the wine-cellar." The Adventures Of Sherlock Holmes "Yes." The Count Of Monte Cristo 0.285540
25809 "And what then?" The Adventures Of Sherlock Holmes "What then?" The Count Of Monte Cristo 0.286767
92638 "Well, guess." Adventures Of Huckleberry Finn, Complete "You guess well." The Count Of Monte Cristo 0.287479
27570 "And is that all?" The Adventures Of Sherlock Holmes "Is that all?" The Man In The Iron Mask 0.290204
17788 "But where are you going?" The Three Musketeers "Where are you going?" A Study In Scarlet 0.291094
3855 "Of whom?" The Count Of Monte Cristo "Whom?" The Three Musketeers 0.294814
23333 "Swear." The Three Musketeers "Swear it." The Count Of Monte Cristo 0.295559
91037 "Yes--indeedy." Adventures Of Huckleberry Finn, Complete "Yes--yes." A Journey To The Centre Of The Earth 0.296599
3100 "And who is Beppo?" The Count Of Monte Cristo "Who is Darius?" The Phantom Of The Opera 0.296669
82013 "But what did you hear?" War And Peace "What did you hear?" Sense And Sensibility 0.296731
11477 "I hear you, monsieur." The Man In The Iron Mask "I hear, messieurs." A Tale Of Two Cities 0.297302
42354 TO ---- The Works Of Edgar Allan Poe 122 --_Nor pierced._ The Iliad Of Homer 0.297489
46478 A silence. The Phantom Of The Opera SILENCE The Works Of Edgar Allan Poe 0.298765
11900 "What did he do, then?" The Man In The Iron Mask "What, then, did Peterson do?" The Adventures Of Sherlock Holmes 0.299088
32133 "Not the least." Great Expectations "Not in the least." The Count Of Monte Cristo 0.299178
91747 Romeo...................................... Mr. Garrick. Adventures Of Huckleberry Finn, Complete Frs. Coupe, thirty-five francs.............................. 35. From Chalons to Lyons you will go on by the steamboat.. 6. From Lyons to Avignon (still by steamboat)............. 16. From Avignon to Marseilles, seven francs............... 7. Expenses on the road, about fifty francs............... 50. Total................................................. 114 frs. The Count Of Monte Cristo 0.299452
65834 "Where, where?" Peter Pan "Where?" The Count Of Monte Cristo 0.301358

100 rows × 5 columns